import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'
%pylab inline
import pysal as ps
import pandas as pd
import seaborn as sns
import geopandas as gpd
import mgwr
import scipy
import statsmodels.api as sm
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from scipy.stats import skew
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from itertools import combinations as comb
from scipy import stats
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from collections import Counter
from matplotlib.lines import Line2D
from sklearn import preprocessing
from sklearn.linear_model import LassoLarsIC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from itertools import combinations
from shapely.geometry import Point
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
#from pysal.viz.splot.libpysal import plot_spatial_weights
#from libpysal import plot_spatial_weights
# Shared helpers: a 5-fold stratified CV splitter and a shorthand for
# z-score standardisation (fit_transform of a fresh StandardScaler).
generator = StratifiedKFold(5, shuffle=True)  # 5 fold cv
scale = preprocessing.StandardScaler().fit_transform
def setFont(ax, font, size):
    """Set font family and size on every x/y tick label of *ax*; return *ax*."""
    tick_labels = ax.get_xticklabels() + ax.get_yticklabels()
    for tick_label in tick_labels:
        tick_label.set_fontname(font)
        tick_label.set_fontsize(size)
    return ax
======== Table of contents ========
Here we load the census data, and extract the target variables
Selected Variables:
# Load the census data and extract the 12 target variables (two per theme:
# population, transport, work, space, housing, economy).
swissData = pd.read_csv('../Data/swissData.csv') ## 90 variables + BFS Code that identifies the municipalities
tv = ['pop23', 'pop29',
      'trpt8', 'trpt10',
      'trv11', 'trv2',
      'esp2', 'esp3',
      'log2', 'log3',
      'fin4', 'fin7']
# Short codes used throughout the notebook for the selected census columns.
nameVariables = ['p1', 'p2',
                 't1', 't2',
                 'w1', 'w2',
                 's1', 's2',
                 'h1', 'h2',
                 'e1', 'e2']
VarDescription = nameVariables
# One colour per theme (pairs share a colour), used for plotting.
clrs = ['darkred', 'darkred',
        'darkgreen', 'darkgreen',
        'teal', 'teal',
        'y', 'y',
        'dimgray', 'dimgray',
        'sienna', 'sienna']
sd = swissData.filter(tv, axis=1)
sd.columns = nameVariables
# Normalise raw counts into rates / shares.
sd['t2'] = sd['t2'] / swissData['trpt9'] ## Divide by the total number of commuters
sd['w2'] = sd['w2'] / swissData['pop3'] ## Divide by the number of women
sd['s1'] = sd['s1'] / swissData['esp1'] ## Divide by the total Area
sd['s2'] = sd['s2'] / swissData['esp1'] ## Divide by the total Area
sd['e1'] = sd['e1'] / swissData['fin1'] ## Divide by the total Revenue
sd['e2'] = sd['e2'] / swissData['fin5'] ## Divide by the total Investment
sd['pop1'] = swissData['pop1']
sd['BFS'] = swissData['BFS']
# Human-readable description of each target-variable code.
target_variables = {
    "p1": "Fraction of foreigners",
    "p2": "Fraction of beneficiaries of the social assistance",
    "t1": "Cars per 1000 inhabitants",
    "t2": "Fraction of commuters using public transportation",
    "w1": "Unemployment rate",
    "w2": "Unemployment rate between women",
    "s1": "Building area (%)",
    "s2": "Green area (%)",
    "h1": "Vacancy rate (%)",
    "h2": "Average area per inhabitant in square meters",
    "e1": "Municipal debt",
    "e2": "Fraction of investment in culture"}
sd.head()
Here we load the insurance data, from which we will extract the features (predictors) used by the various models.
# Load the insurance features (one row per municipality: BFS code + f1..f34),
# visualise their absolute correlation matrix, and prune redundant columns.
dfFeat = pd.read_csv('../data/featuresList2.csv') ## Features of the paper
dfFeat = dfFeat.sort_values(by=['BFS']).reset_index(drop=True)
X = dfFeat.drop(['BFS'], axis=1) ## remove municipality code
corr_matrix = X.corr().abs()
f, ax = plt.subplots(figsize=(15, 15))
# NOTE: np.bool was a deprecated alias of the builtin bool (removed in
# NumPy 1.24); use bool directly.
sns.heatmap(corr_matrix, mask=np.zeros_like(corr_matrix, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
ax.set_title('Correlation Matrix of the Extracted Features', size=20)
plt.show()
# Find features with correlation greater than 0.95 and remove them, if any.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
dfFeat = dfFeat.drop(to_drop, axis=1)  # drop the features
# Also drop constant (single-valued) columns: they carry no information.
nunique = dfFeat.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
dfFeat = dfFeat.drop(cols_to_drop, axis=1)
dfFeat = dfFeat.astype(float)
dfFeatCol = dfFeat.columns ## columns of the features
dfFeatCol = list(dfFeatCol.drop(['BFS']))
We define a dictionary that contains all the extracted features
# Human-readable description of every extracted insurance feature f1..f34.
features_dict = {"f1" : "unemployment rate",
                 "f2" : "average age in the muncipality",
                 "f3" : "Fraction of owners (house)",
                 "f4" : "Fraction of foreigners",
                 "f5" : "Avg number of childer for customers with at least one children",
                 "f6" : "Market Share",
                 "f7" : "fraction of women",
                 "f8" : "Number of customers divided by total customers",
                 "f9" : "average price of the cars",
                 "f10" : "95th percentile price of the cars",
                 "f11" : "Average Year of the car",
                 "f12" : "5th percentile Year of the car",
                 "f13" : "Average CCM of the car",
                 "f14" : "95th percentile CCM of the car",
                 "f15" : "Average number of claims per cars",
                 "f16" : "95th percentile number of claims of the car",
                 "f17" : "Average sum of claims of the car",
                 "f18" : "95th percentile number of price of the car",
                 "f19" : "Average sum of class premium of the car",
                 "f20" : "Percent of insured cars",
                 "f21" : "Average Class of Forniture",
                 "f22" : "95th percentile class of fornitures",
                 "f23" : "Avg Number of Rooms",
                 "f24" : "95th percentile number of rooms",
                 "f25" : "Average Building Insured Sum",
                 "f26" : "95th Building Insured Sum",
                 "f27" : "Average Building Year of Constructions",
                 "f28" : "5th Percentile Building Year of Constructions",
                 "f29" : "Average type of Building",
                 "f30" : "Average number of claims per building",
                 "f31" : "Average Sum of claims per building",
                 "f32" : "95th Sum of claims per building",
                 "f33" : "Average Insured Premium",
                 "f34" : "95th Sum of Insured Premium"}
feature_names = list(features_dict.keys())
We define a large dataframe that contains the features, the target variables, the coordinates, and all important information. Then we define the categories classes based on the population for the cross-validation
# Assemble the master GeoDataFrame: insurance features + census targets +
# geometry + coordinates, plus population strata for stratified CV.
dfData = dfFeat.copy()
dfData = dfData.merge(sd, on='BFS')
# Population-based strata: 0 = <=25k, 1 = 25k-80k, 2 = >80k inhabitants.
dfData['categories'] = 0
dfData.loc[dfData['pop1'] > 25000, 'categories'] = 1
dfData.loc[dfData['pop1'] > 80000, 'categories'] = 2
dfData
mun = gpd.read_file('../data/shapeFiles/municipalities.shp')
mun = mun.to_crs({'init': 'epsg:21781'})  # Swiss CH1903/LV03 projection
mun.drop_duplicates(subset='BFS_NUMMER', keep="last", inplace=True)
mun_ = mun.filter(['BFS_NUMMER', 'geometry'])
cantons = gpd.read_file("../data/shapeFiles/cantons.shp")
cantons = cantons.to_crs({'init': 'epsg:21781'})
dfData = pd.merge(dfData, mun_, left_on='BFS', right_on='BFS_NUMMER')
dfData = gpd.GeoDataFrame(dfData, geometry='geometry')
coord = pd.read_csv('../data/bfs_name_coordinates.csv', delimiter=',', encoding="iso-8859-1")
dfData = pd.merge(dfData, coord, on='BFS')
# Per-municipality (x, y) tuples used by the GWR routines.
dfData['g_coords'] = list(zip(dfData['Xcoord'], dfData['Ycoord']))
g_coords = list(dfData.g_coords.values)
### 2.2 Create matrix weights for our analysis
from libpysal.weights import KNN, Kernel
# Two alternative spatial weight matrices over the municipalities.
w_knn = KNN.from_dataframe(dfData)
w_gaussian = Kernel.from_dataframe(dfData, function='gaussian')
# Side-by-side plot of the two weight graphs over the municipality map.
f, ax = plt.subplots(1, 2, figsize=(18, 6))
ax0 = ax[0]
ax0 = setFont(ax0, 'Arial', 18)
w_knn.plot(dfData, ax=ax0,
           edge_kws=dict(color='r', linestyle=':', linewidth=1),
           node_kws=dict(marker=''))
dfData.plot(ax=ax0)
ax0.set_title('KNN', size=20)
ax0.axis('off');
ax1 = ax[1]
ax1 = setFont(ax1, 'Arial', 18)
w_gaussian.plot(dfData, ax=ax1,
                edge_kws=dict(color='r', linestyle=':', linewidth=1),
                node_kws=dict(marker=''))
dfData.plot(ax=ax1)
ax1.set_title('Gaussian', size=20)
ax1.axis('off');
# Global Moran's I of each target variable under the gaussian weights.
I = []
for k in target_variables:
    y = dfData[k]
    mi = ps.explore.esda.Moran(y, w_gaussian)
    I.append((k, round(mi.I, 2)))
fig, ax = plt.subplots(figsize=(10, 5))
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Arial')
    label.set_fontsize(15)
g = sns.barplot([x[0] for x in I], [x[1] for x in I], color='midnightblue')
ax.set_ylabel("Moran's I", size=25)
ax.set_xlabel(' ', size=25);
We introduce 3 different models:
def run_OLS(df, target_var, predictor_vars):
    """Fit an ordinary least squares model of *target_var* on the
    standardized *predictor_vars* (with an intercept) and return the
    fitted statsmodels results object."""
    features, response = extract_scale_features(df, target_var, predictor_vars)
    return sm.OLS(response, sm.add_constant(features)).fit()
def run_GWR(df, target_var, predictor_vars):
    """Fit a geographically weighted regression of *target_var* on the
    standardized *predictor_vars*.

    The gaussian-kernel bandwidth is fixed and selected by AICc; returns
    the fitted mgwr results object."""
    features, response = extract_scale_features(df, target_var, predictor_vars)
    coords = list(df.g_coords.values)
    bandwidth = Sel_BW(coords, response, features,
                       kernel='gaussian', fixed=True).search(criterion='AICc')
    return GWR(coords, response, features, bandwidth,
               kernel='gaussian', fixed=True).fit()
def generate_W_knn(df, p=2, k=5):
    """Build a k-nearest-neighbour spatial weights object from the
    Xcoord/Ycoord columns of *df* (Minkowski order *p*, *k* neighbours)."""
    tree = ps.lib.cg.kdtree.KDTree(df[['Xcoord', 'Ycoord']].values)
    return ps.lib.weights.KNN(tree, p=p, k=k)
# Weights over the full dataset, built once; note run_SLM captures this
# object as its default W at definition time.
W_knn = generate_W_knn(dfData)
def run_SLM(df, target_var, predictor_vars, W=W_knn):
    """Fit a spatial lag model (GM_Lag) of target_var on the standardized
    predictors, with spatial diagnostics enabled; returns the fitted model."""
    X, y = extract_scale_features(df, target_var, predictor_vars)
    estimator = ps.model.spreg.GM_Lag(y, X, w=W, spat_diag=True)
    return estimator
def get_r2(est):
    """Return the R^2 of a fitted model, whatever API it exposes.

    Tries, in order: spreg-style ``est.R2``, statsmodels-style
    ``est.rsquared``, and finally recomputing from ``est.y`` /
    ``est.predy`` (GWR results); returns NaN when none applies.

    Narrowed from bare ``except:`` to ``except AttributeError`` for the
    attribute probes, so real errors (e.g. KeyboardInterrupt) propagate.
    """
    try:
        return est.R2  # pysal/spreg models
    except AttributeError:
        pass
    try:
        return est.rsquared  # statsmodels OLS results
    except AttributeError:
        pass
    try:
        return r2_score(est.y, est.predy)  # mgwr results expose y/predy
    except Exception:
        return np.nan
def extract_scale_features(df, target_var, predictor_vars):
    """Return (X, y) with the predictors and the target z-score
    standardized; y is reshaped to a single column."""
    X_std = scale(df[predictor_vars].values)
    y_std = scale(df[target_var].values.reshape(-1, 1))
    return X_std, y_std
def feature_selection(df, target_var, predictor_vars=feature_names):
    """Select predictors of *target_var* via LASSO-LARS with the BIC
    criterion; returns the names whose coefficient is non-zero."""
    X_std, y_std = extract_scale_features(df, target_var, predictor_vars)
    lasso = LassoLarsIC(criterion='bic', normalize=False)
    lasso.fit(X_std, np.ravel(y_std))
    kept = np.squeeze(np.nonzero(lasso.coef_))
    return [predictor_vars[idx] for idx in kept]
It uses the regression models defined above, but with the census data variables as input
# Baseline models: predict each census target from the OTHER census targets
# (every target whose first letter, i.e. theme, differs from the current one).
r2_baseline = {}
selected_feats = {}
for i, target in enumerate(target_variables):
    varnames = feature_selection(dfData, target)  # NOTE(review): computed but unused in this loop
    # Predictors = all targets outside the current theme (target[0] is the theme letter).
    pred = [i for i in target_variables if target[0] not in i]
    ## run the different models with the selected variables
    est_ols = run_OLS(dfData, target, pred)
    est_gwr = run_GWR(dfData, target, pred)
    if target == 'e1':
        # For e1 the OLS fit stands in for the spatial lag model.
        est_slm = est_ols
    else:
        est_slm = run_SLM(dfData, target, pred, W_knn)
    r2_baseline[target] = [est_ols, est_gwr, est_slm]
# Insurance-feature models: LASSO-select predictors per target, then fit the
# OLS/GWR/SLM triple on the full dataset.
selected_feats = {}
selected_models = {}
g_coords = list(dfData.g_coords.values)
for i, target in enumerate(target_variables):
    varnames = feature_selection(dfData, target)
    selected_feats[target] = varnames
    ## run the different models with the selected variables
    est_ols = run_OLS(dfData, target, varnames)
    est_gwr = run_GWR(dfData, target, varnames)
    est_slm = run_SLM(dfData, target, varnames, W=W_knn)
    selected_models[target] = [est_ols, est_gwr, est_slm]
# Print the selected insurance features for each target variable.
for i in selected_feats:
    print("--------------------\n")
    print("Variable = " + i)
    print("Features = " + str(selected_feats[i]))
    for feats in selected_feats[i]:
        print(features_dict[feats])
import time
import stability as st
def get_current_time():
    """Return the current Unix time rounded to the nearest whole second."""
    now = time.time()
    return int(round(now))
def run_feature_selection(X, y):
    """Return the indices of the features kept by LASSO-LARS (BIC) when
    fitted on the pre-scaled design matrix *X* and response *y*."""
    lasso = LassoLarsIC(criterion='bic', normalize=False)
    lasso.fit(X, np.ravel(y))
    return np.nonzero(lasso.coef_)[0]
# Stability of the feature selection: repeat the LASSO selection on 200
# random 80% subsamples and score the 0/1 selection matrix with the
# external `stability` module.
for ycol in target_variables:
    features_set = []
    for niter in range(200):
        df_sample = dfData.sample(n=round(0.8 * dfFeat.shape[0]))
        X = df_sample.filter(dfFeatCol, axis=1)  ## features
        xNorm = scale(X.values)
        y = df_sample[ycol]
        y = np.array(y).reshape(-1, 1)
        yNorm = scale(y)
        selected = run_feature_selection(xNorm, yNorm)
        # Indicator row: 1 where the feature was selected in this repetition.
        values = list(np.zeros(len(dfFeatCol)))
        for idx in selected:
            values[idx] = 1
        features_set.append(values)
    Z = np.array(features_set)
    stability = st.getStability(Z)
    print("%s: %f" % (ycol, stability))
# Long-format R^2 table of the feature-selected models (one row per
# target x model), then a bar plot of GWR/SLM with OLS as a reference line.
cols = ['target_variable', 'model', 'r2']
df_r2 = pd.DataFrame(columns=cols)
for i, j in zip(selected_models.keys(), selected_models.values()):
    for est, model in zip(j, ['OLS', 'GWR', 'SLM']):
        df_r2 = df_r2.append(pd.DataFrame(columns=cols, data=[[i, model, get_r2(est)]]), ignore_index=True)
f, ax = plt.subplots(figsize=(10, 5))
ax = setFont(ax, 'Arial', 18)
tmp = df_r2[df_r2['model'] != 'OLS']
sns.barplot(data=tmp, hue='model', x='target_variable', y='r2', palette='Set1')
tmp = df_r2[df_r2['model'] == 'OLS']
ax.plot(tmp['target_variable'], tmp['r2'], color='black', lw=4, label='OLS')
ax.set_ylabel(r'$R^2$', size=20)
ax.set_xlabel(r' ', size=20)
ax.legend(fontsize=14, bbox_to_anchor=(1.01, 1))
ax.set_ylim((0, 1))
#ax.set_title('Full dataset', size=18)
#f.savefig('../PaperV1/figures/fig3UP.pdf',dpi=300,bbox_inches='tight')
# Same long-format R^2 table for the census-baseline models, plotted as
# grouped bars per target variable.
cols = ['target_variable', 'model', 'r2']
df_r2_b = pd.DataFrame(columns=cols)
for i, j in zip(r2_baseline.keys(), r2_baseline.values()):
    for est, model in zip(j, ['OLS', 'GWR', 'SLM']):
        df_r2_b = df_r2_b.append(pd.DataFrame(columns=cols, data=[[i, model, get_r2(est)]]), ignore_index=True)
f, ax = plt.subplots(figsize=(10, 5))
ax = setFont(ax, 'Arial', 18)
sns.barplot(data=df_r2_b, hue='model', x='target_variable', y='r2')
ax.set_ylabel(r'$R^2$', size=20)
ax.set_xlabel(r' ', size=20)
ax.legend(fontsize=18, bbox_to_anchor=(1.01, 1))
ax.set_ylim((0, 1))
ax.set_title('Performance of the Baseline models', size=18)
f.savefig('../PaperV1/figures/fig3New.pdf', dpi=300, bbox_inches='tight')
def mergeBaselineResults(df_r2, df_r2_b, col):
    """Merge the insurance and baseline R^2 values for one model family.

    Filters both long-format frames to rows where ``model == col`` and
    returns a new frame with columns
    ``['target_variable', 'r2_ins', 'r2_baseline']`` (one row per target,
    in the filtered row order).

    Fixes: the original mutated a filtered *view* of the caller's frame
    (``del`` / column assignment on a slice) — we now work on an explicit
    copy — and relied on index alignment for the baseline column, which is
    replaced by a positional assignment of the filtered values.
    """
    tmp = df_r2[df_r2['model'] == col].copy()
    tmp = tmp.drop(columns=['model'])
    tmp.columns = ['target_variable', 'r2_ins']
    tmp_b = df_r2_b[df_r2_b['model'] == col]
    # Both frames are built in the same target order, so align positionally.
    tmp['r2_baseline'] = tmp_b['r2'].values
    return tmp
# Three panels (OLS / GWR / SLM) comparing insurance-feature R^2 against
# the census-baseline R^2, side by side per target variable.
f, axs = plt.subplots(1, 3, figsize=(20, 5))
ax0 = axs[0]
ax0 = setFont(ax0, 'Arial', 12)
p1 = mergeBaselineResults(df_r2, df_r2_b, 'OLS')
p1.plot.bar(x='target_variable', ax=ax0)
ax0.set_ylim((0, 1))
ax0.set_title('OLS', size=14)
ax1 = axs[1]
ax1 = setFont(ax1, 'Arial', 12)
ax1.set_title('GWR', size=14)
p2 = mergeBaselineResults(df_r2, df_r2_b, 'GWR')
p2.plot.bar(x='target_variable', ax=ax1)
ax2 = axs[2]
ax2 = setFont(ax2, 'Arial', 12)
ax2.set_title('SLM', size=14)
p3 = mergeBaselineResults(df_r2, df_r2_b, 'SLM')
p3.plot.bar(x='target_variable', ax=ax2)
ax2.set_ylim((0, 1))
f.suptitle('Comparison between insurance and baseline performance', size=20);
# or plt.suptitle('Main title')
f.savefig('../Response/figures/comparison0.pdf', dpi=300, bbox_inches='tight')
# Three panels with the per-target DIFFERENCE between insurance-feature R^2
# and baseline R^2, one panel per model family.
f, ax = plt.subplots(1, 3, figsize=(20, 5))
ax0 = ax[0]
ax0 = setFont(ax0, 'Arial', 12)
p1 = mergeBaselineResults(df_r2, df_r2_b, 'OLS')
p1['diff'] = p1['r2_ins'] - p1['r2_baseline']
p1.plot.bar(x='target_variable', y='diff', ax=ax0, color='navy')
ax0.set_ylim((-.2, .2))
ax0.set_title('OLS', size=14)
ax1 = ax[1]
ax1 = setFont(ax1, 'Arial', 12)
p2 = mergeBaselineResults(df_r2, df_r2_b, 'GWR')
p2['diff'] = p2['r2_ins'] - p2['r2_baseline']
p2.plot.bar(x='target_variable', y='diff', ax=ax1, color='navy')
ax1.set_ylim((-.2, .2))
ax1.set_title('GWR', size=14)
ax2 = ax[2]
ax2 = setFont(ax2, 'Arial', 12)
p3 = mergeBaselineResults(df_r2, df_r2_b, 'SLM')
p3['diff'] = p3['r2_ins'] - p3['r2_baseline']
# BUGFIX: the SLM panel previously re-plotted the GWR frame (p2); plot p3.
p3.plot.bar(x='target_variable', y='diff', ax=ax2, color='navy')
ax2.set_ylim((-.2, .2))
ax2.set_title('SLM', size=14)
f.suptitle('Difference between insurance and baseline performance', size=20);  # or plt.suptitle('Main title')
f.savefig('../Response/figures/comparison.pdf', dpi=300, bbox_inches='tight')
# Paper figure: Delta R^2 (insurance minus baseline) for GWR and SLM only.
f, ax = plt.subplots(1, 2, figsize=(20, 5))
ax1 = ax[0]
ax1 = setFont(ax1, 'Arial', 20)
p2 = mergeBaselineResults(df_r2, df_r2_b, 'GWR')
p2['diff'] = p2['r2_ins'] - p2['r2_baseline']
p2.plot.bar(x='target_variable', y='diff', ax=ax1, color=sns.color_palette("Set1", 1)[0])
ax1.set_ylim((-.2, .2))
ax1.set_title('GWR', size=22)
ax1.set_xlabel(' ')
ax1.set_ylabel(r'$\Delta R^2$', size=22)
ax1.get_legend().remove()
ax2 = ax[1]
ax2 = setFont(ax2, 'Arial', 20)
p3 = mergeBaselineResults(df_r2, df_r2_b, 'SLM')
p3['diff'] = p3['r2_ins'] - p3['r2_baseline']
p3.plot.bar(x='target_variable', y='diff', ax=ax2, color=sns.color_palette("Set1", 2)[1])
ax2.set_ylim((-.2, .2))
ax2.set_title('SLM', size=22)
ax2.set_xlabel(' ')
ax2.set_ylabel(r'$\Delta R^2$', size=22)
ax2.get_legend().remove()
f.savefig('../PaperV1/figures/newFigure4.pdf', dpi=300, bbox_inches='tight')
def compute_conf_slm(model):
    """Return (lower, upper) 95% confidence bounds for every SLM coefficient.

    Uses a Student-t critical value with n - k degrees of freedom applied
    to the model's standard errors; each return value is a list aligned
    with ``model.betas``.
    """
    alpha = 0.05
    dof = model.n - model.k
    t_crit = scipy.stats.t.ppf(1 - alpha / 2, dof)
    lower = [model.betas[idx, 0] - t_crit * model.std_err[idx]
             for idx in range(len(model.betas))]
    upper = [model.betas[idx, 0] + t_crit * model.std_err[idx]
             for idx in range(len(model.betas))]
    return lower, upper
# Coefficient tables per target: p-values / coefficients / confidence
# intervals / standard errors, one dataframe per model family.
ols_stats, gwr_stats, slm_stats = {}, {}, {}
for vv in selected_models:
    est_ols, est_gwr, est_slm = selected_models[vv]
    cols = ['features', 'pvalue', 'coef', 'conf_inter', 'std_error']
    ## ols: global coefficients from statsmodels
    dfaux = pd.DataFrame()
    dfaux['features'] = ['Intercept'] + selected_feats[vv]
    dfaux['pvalue'] = est_ols.pvalues
    dfaux['coef'] = est_ols.params
    dfaux['conf_inter_low'] = est_ols.conf_int()[:, 0]
    dfaux['conf_inter_high'] = est_ols.conf_int()[:, 1]
    dfaux['std_err'] = est_ols.bse
    dfaux['tvalues'] = est_ols.tvalues
    ols_stats[vv] = dfaux
    # gwr: local coefficients summarised across municipalities (axis 0)
    dfaux = pd.DataFrame()
    dfaux['features'] = ['Intercept'] + selected_feats[vv]
    dfaux['coef_mean'] = np.mean(est_gwr.params, axis=0)
    dfaux['coef_std'] = np.std(est_gwr.params, axis=0)
    dfaux['coef_min'] = np.min(est_gwr.params, axis=0)
    dfaux['coef_median'] = np.median(est_gwr.params, axis=0)
    dfaux['coef_max'] = np.max(est_gwr.params, axis=0)
    dfaux['std_err_mean'] = np.mean(est_gwr.bse, axis=0)
    dfaux['std_err_std'] = np.std(est_gwr.bse, axis=0)
    dfaux['std_err_min'] = np.min(est_gwr.bse, axis=0)
    dfaux['std_err_median'] = np.median(est_gwr.bse, axis=0)
    dfaux['std_err_max'] = np.max(est_gwr.bse, axis=0)
    gwr_stats[vv] = dfaux
    # slm: the extra last row is the spatially lagged dependent variable
    dfaux = pd.DataFrame()
    dfaux['features'] = ['Intercept'] + selected_feats[vv] + ['W_dep_var']
    dfaux['coef'] = est_slm.betas
    dfaux['std_err'] = est_slm.std_err
    cf_low, cf_high = compute_conf_slm(est_slm)
    dfaux['conf_inter_low'] = cf_low
    dfaux['conf_inter_high'] = cf_high
    dfaux['z_stat'] = np.array(est_slm.z_stat)[:, 0]
    dfaux['probability'] = np.array(est_slm.z_stat)[:, 1]
    slm_stats[vv] = dfaux
# Notebook display cells: the tables for target p1.
ols_stats['p1']
gwr_stats['p1']
slm_stats['p1']
Let's look at the selected features for each variable and double-check that the regressor for outcome X doesn't involve explanatory variables that are related to X in an obvious way
Obvious relationships:
- 'Fraction of foreigners'
Fraction of foreigners
- 'Fraction of beneficiaries of the social assistance'
unemployment rate
- 'Unemployment rate'
unemployment rate
- 'Unemployment rate between women'
unemployment rate
- 'Vacancy rate (%)'
Fraction of owners (house)
Variables without obvious relationships:
- 'Building area (%)'
- 'Green area (%)'
- 'Average area per inhabitant in square meters'
- 'Municipal debt'
- 'Fraction of investment in culture'
- 'Cars per 1000 inhabitants'
- 'Fraction of commuters using public transportation'
Now we can assess the performance of increasing-complexity models for the variables that have obvious relationships with some features
# Selected features for the variables we want to further explore.
assess_variables = {'p1': ['f4', 'f3', 'f7', 'f23'],
                    'p2': ['f1', 'f9', 'f13', 'f15', 'f23'],
                    'w1': ['f1', 'f4', 'f7', 'f13', 'f23'],
                    'w2': ['f1', 'f4', 'f6', 'f7', 'f9', 'f16', 'f19', 'f23', 'f33'],
                    'h1': ['f3', 'f20']}
# Refit the three models on nested predictor sets of growing size: the k-th
# row of each table uses the first k features listed above.
r2_increasing_complexity = {}
for vv in assess_variables:
    cols = ['OLS', 'GWR', 'SLM']
    df_tmp = pd.DataFrame(columns=cols)
    print('-------' + vv + '---------')
    varnames = []
    for i in assess_variables[vv]:
        varnames.append(i)
        est_gwr = run_GWR(dfData, vv, varnames)
        est_ols = run_OLS(dfData, vv, varnames)
        est_slm = run_SLM(dfData, vv, varnames)
        aux = [get_r2(est_ols), get_r2(est_gwr), get_r2(est_slm)]
        df_tmp = df_tmp.append(pd.DataFrame(columns=cols, data=[aux]), ignore_index=True)
    r2_increasing_complexity[vv] = df_tmp
# One line plot per variable: R^2 versus number of predictors used.
for vv in assess_variables:
    fig, ax = plt.subplots()
    sns.lineplot(data=r2_increasing_complexity[vv])
    ax.set_title(vv)
    ax.set_xticks(range(len(assess_variables[vv])))
    ax.set_xticklabels(assess_variables[vv])
def cross_validation(df, case, selected_feats=selected_feats):
    """Stratified 5-fold CV of one model family over all target variables.

    Parameters:
        df: the full GeoDataFrame (must carry 'categories' and 'g_coords').
        case: 'OLS', 'GWR' or 'SLM' — which model family to evaluate.
        selected_feats: mapping target -> predictor names; the default binds
            the module-level dict at definition time.

    Returns a DataFrame with one row per (fold, target): the target name,
    training R^2 ('r2Tr') and validation R^2 ('r2Val').
    """
    tmp = []
    # `generator` is the module-level StratifiedKFold; folds are stratified
    # on the population-based 'categories' column.
    for train_index, test_index in generator.split(df, df['categories']):
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]
        c_test = list(df_test.g_coords)
        for vv in selected_feats:
            Xtrain, ytrain = extract_scale_features(df_train, vv, selected_feats[vv])
            Xtest, ytest = extract_scale_features(df_test, vv, selected_feats[vv])
            if case == 'GWR':
                est_gwr = run_GWR(df_train, vv, selected_feats[vv])
                # In-sample fit: local coefficients times the design matrix.
                yest = np.sum(sm.add_constant(Xtrain) * est_gwr.params,
                              axis=1).reshape((-1, 1))
                # predict in the testing locations
                Res = GWR.predict(est_gwr.model, np.array(c_test), np.array(ytest))
                ypred = np.sum(sm.add_constant(Xtest) * Res.params, axis=1).reshape((-1, 1))
                tmp.append([vv, r2_score(ytrain, yest), r2_score(ytest, ypred)])  ## training r squared
            if case == 'OLS':
                est_ols = run_OLS(df_train, vv, selected_feats[vv])
                yest = est_ols.predict(sm.add_constant(Xtrain))
                ypred = est_ols.predict(sm.add_constant(Xtest))
                tmp.append([vv, r2_score(ytrain, yest), r2_score(ytest, ypred)])  ## training r squared
            if case == 'SLM':
                est_slm = run_SLM(df_train, vv, selected_feats[vv], W=generate_W_knn(df_train))
                # Spatial lag of the training response evaluated at each test
                # municipality: append the test row to the training set, take
                # the last row of the resulting KNN weights, and dot it with
                # ytrain. NOTE(review): assumes the appended row ends up last
                # in the dense weights matrix — confirm with libpysal docs.
                yW = [float(np.dot(generate_W_knn(df_train.append([df_test.iloc[i]])).full()[0][-1, :][:-1], ytrain))
                      for i in range(df_test.shape[0])]
                x_aux = np.hstack([sm.add_constant(Xtest), np.array(yW).reshape(-1, 1)])
                yest = est_slm.predy
                ypred = np.matmul(x_aux, est_slm.betas)
                tmp.append([vv, r2_score(ytrain, yest), r2_score(ytest, ypred)])
    return pd.DataFrame(columns=['target_variable', 'r2Tr', 'r2Val'], data=tmp)
# Run the cross-validation per model family and plot training/validation R^2.
# NOTE(review): r2Mean/r2Std are overwritten on every pass of this loop, so
# the figure below uses only the LAST case's values ('SLM') for both panels —
# confirm against the original notebook cell layout.
for case in ['OLS', 'GWR', 'SLM']:
    df_crossval = cross_validation(dfData, case)
    r2Std = df_crossval.groupby('target_variable', sort=False).apply(np.std).values
    r2Mean = df_crossval.groupby('target_variable', sort=False).apply(mean).values
f, axs = plt.subplots(1, 2, figsize=(20, 5))
for i, case in enumerate(['GWR', 'SLM']):
    ax = axs[i]
    xt = np.arange(0, len(target_variables.keys()))
    ax = setFont(ax, 'Arial', 20)
    width = 0.3
    if i == 0:
        c1 = sns.color_palette("Set1", 1)[0]
        c2 = sns.color_palette("Set1", 8)[2]
        c2 = 'salmon'
    if i == 1:
        c1 = sns.color_palette("Set1", 2)[1]
        c2 = sns.color_palette("Set1", 8)[2]
        c2 = 'skyblue'
    # Training/validation bars with std-dev error bars, plus the full-dataset
    # R^2 of the same model family as black markers.
    p0 = ax.bar(xt - width / 2, r2Mean[:, 0], width, align='center', yerr=r2Std[:, 0], label='Training', color=c1)
    p1 = ax.bar(xt + width / 2, r2Mean[:, 1], width, align='center', yerr=r2Std[:, 1], label='Validation', color=c2)
    p2 = ax.plot(xt, df_r2[df_r2.model == case].r2.values, lw=0, marker='v',
                 markersize=10, color='black', label='Full Dataset')
    ax.set_xticks(xt)
    ax.set_xticklabels(list(target_variables.keys()))
    #ax.set_xticks(xt,list(target_variables.keys()))
    ax.legend(fontsize=18, bbox_to_anchor=(1.1, 1.2))
    ax.set_ylim([0, 1]);
    ax.set_title(case, size=22);
    ax.set_ylabel('$R^2$', size=22);
    #if i ==0:
    #    ax.get_legend().remove()
#f.savefig('../PaperV1/figures/figur3_new_bottom.pdf',dpi=300,bbox_inches='tight')
# Deep dive on t2 (public transport commuting): selected predictors, model
# statistics and a map of the GWR local R^2 per municipality.
varnames = feature_selection(dfData, 't2')
print('Predictors for Public Transportation')
for i in varnames:
    print(i + ': ' + features_dict[i])
# Notebook display cells for the t2 coefficient tables.
ols_stats['t2']
gwr_stats['t2']
slm_stats['t2']
est_gwr = selected_models['t2'][1]  # [OLS, GWR, SLM] -> GWR fit for t2
dfc = gpd.GeoDataFrame(data={'BFS': dfData.BFS, 'geometry': dfData.geometry, 'clr': ravel(est_gwr.localR2)})
m = 30  # Custom Legend
# Legend entries hand-matched to the fisher_jenks class breaks below.
legend_elements = [Line2D([0], [0], marker='o', color='w', label='0.43 - 0.57', markerfacecolor='lightblue', markersize=m),
                   Line2D([0], [0], marker='o', color='w', label='0.57 - 0.66', markerfacecolor='green', markersize=m),
                   Line2D([0], [0], marker='o', color='w', label='0.66 - 0.73', markerfacecolor='orange', markersize=m),
                   Line2D([0], [0], marker='o', color='w', label='0.73 - 0.82', markerfacecolor='purple', markersize=m),
                   Line2D([0], [0], marker='o', color='w', label='0.82 - 0.91', markerfacecolor='brown', markersize=m)]
d = {'fontsize': 40, 'loc': 2}
m = {'markersize': 30}
fig, ax = plt.subplots(figsize=(35, 35), facecolor='white', edgecolor='white')
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontname('Arial')
    label.set_fontsize(40)
cantons.plot(ax=ax, color='white', edgecolor='black')
pt = dfc.plot(column='clr', edgecolor='black', markersize=40,
              legend=False, scheme='fisher_jenks', cmap='Paired', ax=ax, legend_kwds=d)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.set_xticks([])
ax.set_yticks([]);
ax.legend(handles=legend_elements, loc=2, fontsize=40)
ax.set_title('Local $R^2$', size=65)
plt.show()
def plot_map(clr, title):
    """Choropleth of the per-municipality values *clr* over the canton
    outlines (fisher_jenks classes, RdYlBu_r colormap); returns the figure."""
    colored = pd.merge(mun,
                       pd.DataFrame(data={'BFS': dfFeat.BFS, 'clr': clr}),
                       left_on='BFS_NUMMER', right_on='BFS')
    fig, ax = plt.subplots(figsize=(12, 7), facecolor='white', edgecolor='white')
    for tick in ax.get_xticklabels() + ax.get_yticklabels():
        tick.set_fontname('Arial')
        tick.set_fontsize(12)
    cantons.plot(ax=ax, color='white', edgecolor='black')
    colored.plot(ax=ax, column='clr', edgecolor='black', legend=True,
                 scheme='fisher_jenks', cmap='RdYlBu_r')
    ax.set_ylabel('Y [km]', size=14, rotation=90)
    ax.set_xlabel('X [km]', size=14)
    # Axes are in metres; relabel the ticks in kilometres.
    ax.set_xticklabels(np.round(ax.get_xticks() / 1000))
    ax.set_yticklabels(np.round(ax.get_yticks() / 1000))
    plt.title(title, fontsize=16)
    plt.show()
    return fig
def annotate(target_point, text_location, text):
    """Draw *text* at *text_location* (data coordinates) with an arrow
    pointing to *target_point*."""
    plt.annotate(
        text, target_point, text_location, 'data',
        arrowprops=dict(arrowstyle="-|>", connectionstyle="angle3", lw=1),
        size=16, ha="center")
# Back-transform the standardized GWR predictions of t2 to percentage units
# and compare predicted vs. actual values, labelling the largest cities.
mu = np.mean(dfData['t2'])
std = np.std(dfData['t2'])
tmp = dfData.filter(items=['BFS', 't2', 'CityLaMobiliere'])
tmp['t2_pred'] = 100 * ravel(est_gwr.predy * std + mu)  # Predicted t2 units [%]
tmp['t2'] = 100 * tmp['t2']  # Actual t2 units [%]
fig = plot_map(tmp['t2'], 'Actual t2')
fig = plot_map(tmp['t2_pred'], 'Predicted t2')
#fig.savefig('../Paper/figures/predt2.pdf',dpi=300,bbox_inches='tight')
fig = plot_map(ravel(est_gwr.localR2), 'Local R$^2$')
#fig.savefig('../Paper/figures/localr2.pdf',dpi=300,bbox_inches='tight')
f, ax = plt.subplots(figsize=(7, 7))
ax = setFont(ax, 'Arial', 18)
ax.scatter(tmp['t2'], tmp['t2_pred'])
city_annotate = ['Zürich', 'Genève', 'Bern', 'Basel', 'Lausanne', 'Lugano', 'Bellinzona']
#hd=[0,0,0,-5,5] # displacement for annotation with arrows
#vd=[-5,-15,-5,-5,-5]
hd = [-3, 0, 0, 0, 0, 0, 0, 0.5]  # displacement for only text annotation
vd = [0.5, 0, -1, 0, 0, 0, -1, -0.5]
for j, city in enumerate(city_annotate):
    aux = tmp[tmp['CityLaMobiliere'] == city]
    #annotate((aux['t2'],aux['t2_pred']),(aux['t2']+hd[j],aux['t2_pred']+vd[j]),city)
    plt.text(aux['t2'] + hd[j], aux['t2_pred'] + vd[j], city, fontsize=18)
plt.text(10, 50, "R$^2$ = " + str(round(r2_score(tmp['t2'], tmp['t2_pred']), 2)), fontsize=20)
plt.ylabel("Predicted t2", fontsize=20)
plt.xlabel("Actual t2", fontsize=20)
plt.show()
We predict those six target variables by removing the trivial features
# Re-declare the six targets with "trivial" insurance counterparts, plus the
# feature dictionary (identical to the earlier definition).
target_variables_ = {
    "p1": "Fraction of foreigners",
    "p2": "Fraction of beneficiaries of the social assistance",
    "t1": "Cars per 1000 inhabitants",
    "t2": "Fraction of commuters using public transportation",
    "w1": "Unemployment rate",
    "w2": "Unemployment rate between women"}
features_dict = {"f1" : "unemployment rate",
                 "f2" : "average age in the muncipality",
                 "f3" : "Fraction of owners (house)",
                 "f4" : "Fraction of foreigners",
                 "f5" : "Avg number of childer for customers with at least one children",
                 "f6" : "Market Share",
                 "f7" : "fraction of women",
                 "f8" : "Number of customers divided by total customers",
                 "f9" : "average price of the cars",
                 "f10" : "95th percentile price of the cars",
                 "f11" : "Average Year of the car",
                 "f12" : "5th percentile Year of the car",
                 "f13" : "Average CCM of the car",
                 "f14" : "95th percentile CCM of the car",
                 "f15" : "Average number of claims per cars",
                 "f16" : "95th percentile number of claims of the car",
                 "f17" : "Average sum of claims of the car",
                 "f18" : "95th percentile number of price of the car",
                 "f19" : "Average sum of class premium of the car",
                 "f20" : "Percent of insured cars",
                 "f21" : "Average Class of Forniture",
                 "f22" : "95th percentile class of fornitures",
                 "f23" : "Avg Number of Rooms",
                 "f24" : "95th percentile number of rooms",
                 "f25" : "Average Building Insured Sum",
                 "f26" : "95th Building Insured Sum",
                 "f27" : "Average Building Year of Constructions",
                 "f28" : "5th Percentile Building Year of Constructions",
                 "f29" : "Average type of Building",
                 "f30" : "Average number of claims per building",
                 "f31" : "Average Sum of claims per building",
                 "f32" : "95th Sum of claims per building",
                 "f33" : "Average Insured Premium",
                 "f34" : "95th Sum of Insured Premium"}
# Refit the models after removing the "trivial" insurance feature that
# directly mirrors each target (e.g. p1 vs f4 = fraction of foreigners).
# NOTE(review): iterates over ALL 12 targets (target_variables, not
# target_variables_), and list.remove raises ValueError if the feature was
# not selected by the LASSO — confirm both are intended.
selected_feats = {}
selected_models = {}
g_coords = list(dfData.g_coords.values)
for i, target in enumerate(target_variables):
    varnames = feature_selection(dfData, target)
    if target == 'p1':
        varnames.remove('f4')
    elif target == 'p2':
        varnames.remove('f1')
    elif target == 't1':
        varnames.remove('f20')
    elif target == 't2':
        varnames.remove('f3')
    elif target == 'w1':
        varnames.remove('f1')
    elif target == 'w2':
        varnames.remove('f1')
    selected_feats[target] = varnames
    ## run the different models with the selected variables
    est_ols = run_OLS(dfData, target, varnames)
    est_gwr = run_GWR(dfData, target, varnames)
    est_slm = run_SLM(dfData, target, varnames, W=W_knn)
    selected_models[target] = [est_ols, est_gwr, est_slm]
# R^2 of the models fitted without the trivial features, and the change in
# performance relative to the full feature set.
cols = ['target_variable', 'model', 'r2']
df_r2_trivial = pd.DataFrame(columns=cols)
for i, j in zip(selected_models.keys(), selected_models.values()):
    for est, model in zip(j, ['OLS', 'GWR', 'SLM']):
        df_r2_trivial = df_r2_trivial.append(pd.DataFrame(columns=cols,
                                                          data=[[i, model, get_r2(est)]]),
                                             ignore_index=True)
f, ax = plt.subplots(figsize=(10, 5))
ax = setFont(ax, 'Arial', 18)
sns.barplot(data=df_r2_trivial, hue='model', x='target_variable', y='r2')
ax.set_ylabel(r'$R^2$', size=20)
ax.set_xlabel(r' ', size=20)
ax.legend(fontsize=18, bbox_to_anchor=(1.01, 1))
ax.set_ylim((0, 1))
# Typo fixed in the title ('witohout' -> 'without').
ax.set_title('Performance of the models without trivial features', size=18)
# Keep only the six targets with a removed trivial feature: 6 targets x 3
# models = the first 18 rows of each long-format frame.
df_r2_trivial = df_r2_trivial[df_r2_trivial.index < 18]
tmp = df_r2[df_r2.index < 18]
df_r2_trivial['diff'] = tmp['r2'] - df_r2_trivial['r2']
f, ax = plt.subplots(figsize=(10, 5))
ax = setFont(ax, 'Arial', 18)
tmp = df_r2_trivial[df_r2_trivial['model'] != 'OLS']
sns.barplot(data=tmp, hue='model', x='target_variable', y='diff')
ax.set_ylabel(r'$\Delta R^2$', size=20)
ax.set_xlabel(r' ', size=20)
ax.legend(fontsize=18, bbox_to_anchor=(1.01, 1))
ax.set_title('Performance variation due to the removal of trivial features', size=18)
#f.savefig('../PaperV1/figures/deltaR2_new_a.pdf',dpi=300,bbox_inches='tight')
# Moran's I of the OLS residuals versus the raw variables (9999 permutations
# each) — checks how much spatial structure the features leave unexplained.
yCol = sd.columns[:-2]  # the 12 target codes (drops pop1 and BFS)
from shapely.geometry import Point
cords_aux = list(zip(dfData['Xcoord'].values, dfData['Ycoord'].values))
df_res = gpd.GeoDataFrame()
df_res['geometry'] = [Point(i) for i in cords_aux]
df_res.plot()
plt.show()
tmp = []
for idx, ycol in enumerate(yCol):
    yNorm = scale(np.array(dfData[ycol]).reshape(-1, 1))
    varnames = selected_feats[ycol]
    X = dfData.filter(varnames, axis=1)  ## features
    xNorm = scale(X.values)
    est = sm.OLS(yNorm, sm.add_constant(xNorm)).fit()
    df_res[ycol + '_res'] = est.resid
    # Moran's I of the residuals, then of the raw (standardized) variable.
    mi = ps.explore.esda.Moran(est.resid, w_gaussian, permutations=9999)
    tmp.append(mi.I)
    tmp.append(mi.p_sim)
    mi = ps.explore.esda.Moran(yNorm, w_gaussian, permutations=9999)
    tmp.append(mi.I)
    tmp.append(mi.p_sim)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
df_mori = pd.DataFrame(data=np.reshape(tmp, (-1, 4)), columns=['I_res', 'Pval_res', 'I_var', 'Pval_var'], index=yCol)
df_mori
# Pairwise Pearson correlation between the target variables, as a heatmap.
a = np.zeros(shape=(len(yCol), len(yCol)))
for i, v in enumerate(yCol):
    for j, k in enumerate(yCol):
        y1 = dfData[v]
        y2 = dfData[k]
        a[i][j] = scipy.stats.pearsonr(np.array(y1.values), np.array(y2.values))[0]
f, ax = plt.subplots(figsize=(10, 10))
# NOTE: np.bool was a deprecated alias of the builtin bool (removed in
# NumPy 1.24); use bool directly.
sns.heatmap(a, mask=np.zeros_like(a, dtype=bool), annot=True,
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
plt.xticks(np.arange(len(yCol)) + 0.5, yCol);
plt.yticks(np.arange(len(yCol)) + 0.5, yCol);
plt.show()
# Market share (f6) versus population density per municipality.
swissData = pd.read_csv('../Data/swissData.csv') ## 90 variables + BFS Code that identifies the municipalities
dfFeat = pd.read_csv('../data/featuresList2.csv') ## Features of the paper
dfFeat = dfFeat.sort_values(by=['BFS']).reset_index(drop=True)
popSwiss = swissData.filter(['pop1', 'BFS'])
popSwiss = popSwiss.sort_values(['pop1'], ascending=False)
mun = gpd.read_file('../data/shapeFiles/municipalities.shp')
#mun = mun.to_crs({'init': 'epsg:2056'})
mun = mun.to_crs({'init': 'epsg:21781'})  # change coordinate system CH1903
mun = mun.sort_values(by='BFS_NUMMER')
mun = mun.reset_index(drop=True)
mun1 = mun.filter(['BFS_NUMMER', 'GEM_FLAECH', 'NAME'])
mun1.columns = ['BFS', 'AREA', 'NAME']
mun1 = pd.merge(popSwiss, mun1, on='BFS')
mun1['popDensity'] = mun1.pop1 / mun1.AREA
ms = dfFeat.filter(['BFS', 'f6'])  # f6 = market share
mun1 = pd.merge(mun1, ms, on='BFS')
f, ax = plt.subplots(figsize=(6.5, 6.5))
ax = setFont(ax, 'Arial', 12)
plt.scatter(mun1.f6, mun1.popDensity)
ax.set_ylabel('Population Density - '+'Habitants/'r'$km^2$', size=14)
ax.set_xlabel('Market Share (%)', size=14)
We report the performance of our model in comparison with different cities' characteristics, such as population, market share and population density. We compare the local R2 of the GWR with the cities' characteristics
# GWR local R^2 versus municipality population for every target
# (2x6 grid of log-log scatter plots).
y = dfData['pop1'].values
c1 = 0
c2 = 0
f, axs = plt.subplots(2, 6, figsize=(20, 10))
g_coords = list(dfData.g_coords.values)
for i, target in enumerate(target_variables):
    varnames = feature_selection(dfData, target)
    ## run the different models with the selected variables
    est_gwr = run_GWR(dfData, target, varnames)
    ax = axs[c1][c2]
    ax = setFont(ax, 'Arial', 20)
    ax.scatter(est_gwr.localR2, y)
    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_title('Feature ' + target, size=24)
    # Advance to the next panel (2 rows x 6 columns).
    c2 += 1
    if c2 > 5:
        c2 = 0
        c1 += 1
f.suptitle('Performance versus population', size=40);
#f.savefig('../Response/figures/ppop.pdf',dpi=300,bbox_inches='tight')
# GWR local R^2 versus market share (f6) for every target (same 2x6 layout).
y = dfData['f6'].values
c1 = 0
c2 = 0
f, axs = plt.subplots(2, 6, figsize=(20, 10))
g_coords = list(dfData.g_coords.values)
for i, target in enumerate(target_variables):
    varnames = feature_selection(dfData, target)
    ## run the different models with the selected variables
    est_gwr = run_GWR(dfData, target, varnames)
    ax = axs[c1][c2]
    ax = setFont(ax, 'Arial', 20)
    ax.scatter(est_gwr.localR2, y)
    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_title('Feature ' + target, size=24)
    # Advance to the next panel (2 rows x 6 columns).
    c2 += 1
    if c2 > 5:
        c2 = 0
        c1 += 1
f.suptitle('Performance versus market share', size=40);
#f.savefig('../Response/figures/pmshare.pdf',dpi=300,bbox_inches='tight')
### Performance and population density
# GWR local R^2 versus population density for every target (same 2x6 layout).
mun2 = pd.merge(dfData, mun1, on='BFS')
mun2 = mun2.drop_duplicates(subset='BFS')
y = mun2.popDensity.values
c1 = 0
c2 = 0
f, axs = plt.subplots(2, 6, figsize=(20, 10))
g_coords = list(dfData.g_coords.values)
for i, target in enumerate(target_variables):
    varnames = feature_selection(dfData, target)
    ## run the different models with the selected variables
    est_gwr = run_GWR(dfData, target, varnames)
    ax = axs[c1][c2]
    ax = setFont(ax, 'Arial', 20)
    ax.scatter(est_gwr.localR2, y)
    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_title('Feature ' + target, size=24)
    # Advance to the next panel (2 rows x 6 columns).
    c2 += 1
    if c2 > 5:
        c2 = 0
        c1 += 1
f.suptitle('Performance versus population density', size=40);
#f.savefig('../Response/figures/popDens.pdf',dpi=300,bbox_inches='tight')